Domain: Medicine
Objective: Classify patients as Parkinson's-positive or healthy using attributes extracted from their voice recordings.
#import libraries that will be used for EDA
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import average_precision_score, confusion_matrix, accuracy_score, classification_report
# plot_confusion_matrix was removed in scikit-learn 1.2; fall back to
# ConfusionMatrixDisplay.from_estimator (same (estimator, X, y) call shape as
# used below) so the plotting calls work on both old and new versions.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
import warnings
warnings.filterwarnings('ignore')
# Load the Parkinson's data set into the Parks dataframe.
Parks = pd.read_csv('Data - Parkinsons.csv')
# First look at the data set.
Parks.head()
# Some columns are truncated in the view above; widen the pandas display so
# every column is visible.
pd.options.display.max_columns = None
Parks.head()
Parks.shape
# Shape check: the data contains 24 columns and 195 rows.
Parks.info()
# info() shows no null values and consistent (numeric) dtypes throughout.
# The 'name' column is an identifier, not a feature, so it is dropped below.
Parks.isnull().any()
# Another way to confirm there are no NULL values.
Parks.describe().T
# Summary statistics for each column.
Parks.groupby('status').count()
# Class balance: roughly 75% of the records are Parkinson's-positive.
# 'name' will not help model building, so delete it.
Parks = Parks.drop('name', axis = 1)
Parks.shape
# Re-check the shape after dropping 'name'.
Parks.hist(color='lightblue', edgecolor = 'black', alpha = 0.7, figsize = (15,20), layout=(12,3))
plt.tight_layout()
plt.show()
# From the histograms: D2, DFA, HNR, RPDE, spread1 and spread2 are close to a
# normal distribution; the remaining features are quite skewed.
# Per-feature distribution plot: one axis per column (23 features remain after
# dropping 'name').
# sns.distplot is deprecated and removed in current seaborn releases;
# histplot(kde=True) is the supported equivalent.
fig, ax = plt.subplots(23, figsize=(8, 50))
plt.tight_layout()
for i, column in enumerate(Parks):
    sns.histplot(Parks[column], kde=True, ax=ax[i])
# From the density plots: MDVP:Fo(Hz), MDVP:Flo(Hz), PPE, D2, DFA, HNR, RPDE,
# spread1 and spread2 are close to a normal distribution; the rest are skewed.
# For skewness, the closer the value is to 0, the closer the distribution is to normal.
# Negative skew: the left tail is longer; the mass is concentrated on the right.
# Positive skew: the right tail is longer; the mass is concentrated on the left.
Parks.skew()
# Check for outliers: one horizontal box plot per feature.
for feature in Parks:
    plt.figure()
    plt.title(feature)
    sns.boxplot(data=Parks[feature], orient="h", color='pink')
# Almost every feature except DFA, RPDE and MDVP:Fo(Hz) shows outliers.
sns.pairplot(Parks)
# With this many columns the pairplot is hard to read, so fall back to the
# numeric correlation matrix below.
# For correlation, the closer the absolute value is to 1, the stronger the
# relationship between the two features.
Parks.corr()
# Visual representation of the correlation matrix.
plt.figure(figsize = (20,18))
sns.heatmap(Parks.corr(), annot = True)
From the heatmap above we can infer that spread1, spread2 and PPE have considerable positive correlation with status, and many other features show positive correlations as well.
MDVP:Fo(Hz), MDVP:Fhi(Hz), MDVP:Flo(Hz) and HNR have negative correlation with status.
# Distribution of the well-correlated features against status.
sns.scatterplot(x='spread1', y = 'spread2' , hue = 'status' , data = Parks)
# Higher values of spread1 and spread2 correspond to more positive cases.
sns.scatterplot(x='D2', y = 'PPE' , hue = 'status' , data = Parks)
# Higher values of PPE and D2 correspond to more positive cases.
sns.scatterplot(x='MDVP:Shimmer', y = 'Shimmer:DDA' , hue = 'status' , data = Parks)
# Positive cases span the full range of Shimmer:DDA and MDVP:Shimmer.
# Split into features (x) and target (y).
x = Parks.drop('status', axis = 1)
y = Parks['status']
print('Shape of x and y data set is ', x.shape , y.shape, 'respctively')
# Drop features considered unhelpful for predicting status.
# NOTE(review): per the heatmap these four are *negatively* correlated with
# status, not uncorrelated — confirm that dropping them is intended.
x = x.drop(['MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'HNR'], axis = 1)
print('Shape of x data set after dropping non correlated features', x.shape)
# 70/30 train-test split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.30, random_state=1)
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn import preprocessing
# plot_confusion_matrix was removed in scikit-learn 1.2; fall back to
# ConfusionMatrixDisplay.from_estimator so the plotting calls below keep
# working on both old and new versions.
from sklearn.metrics import average_precision_score, confusion_matrix, accuracy_score, classification_report, recall_score
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
# Baseline (unpruned) decision tree using the Gini impurity criterion.
dTree = DecisionTreeClassifier(criterion = 'gini', random_state=1)
dTree.fit(x_train, y_train)
# Prediction on test data.
GDTree_pred = dTree.predict(x_test)
# Measure the accuracy of this model's predictions.
print('Accuracy of the Decision Tree model with Gini critera is', accuracy_score(y_test, GDTree_pred)*100)
print('\n\n Classification Report:')
print(metrics.classification_report(y_test, GDTree_pred))
# Plot the confusion matrix.
plot_confusion_matrix(dTree,x_test,y_test)
Part 1: the model correctly predicted 10 healthy people as healthy (TN) and incorrectly flagged 9 healthy people as having Parkinson's (FP).
Part 2: the model incorrectly predicted 3 Parkinson's patients as healthy (FN) and correctly identified 37 Parkinson's patients (TP).
Since our objective is to classify patients into Parkinson's and healthy labels, correctly identifying Parkinson's patients (recall) is the key criterion for choosing a model. The success rate of this model is therefore 37/40 ≈ 93%.
# Sweep max_depth from 1 to 9 and compare train/test accuracy plus recall.
for depth in range(1, 10):
    pruned_tree = DecisionTreeClassifier(criterion = 'gini', max_depth = depth, random_state=1)
    pruned_tree.fit(x_train, y_train)
    depth_pred = pruned_tree.predict(x_test)
    print('\nAccuracy for Training data with', depth ,' depth tree is:' , pruned_tree.score(x_train, y_train) * 100)
    print('Accuracy for Testing data with', depth, ' depth treee is:' , pruned_tree.score(x_test, y_test)*100)
    print('Recall score for testing data with' , depth, 'depth tree is:', metrics.recall_score(y_test, depth_pred)*100)
# A depth-1 tree reaches 81.36% test accuracy, but depth 2 gives a better
# recall score — so build a depth-2 tree and keep it for later comparison.
# Build the regularized (max_depth = 2) model with the Gini criterion.
GdTree = DecisionTreeClassifier(criterion = 'gini', random_state=1 , max_depth = 2 )
GdTree.fit(x_train, y_train)
# Prediction on test data.
GDTree_pred = GdTree.predict(x_test)
# Store accuracy and recall for the final model-comparison table.
DT_Accuracy = accuracy_score(y_test, GDTree_pred)*100
DT_Recallscore = recall_score(y_test, GDTree_pred)*100
# Measure the accuracy of this model's predictions.
print('Accuracy of the Decision Tree model with Gini critera is', DT_Accuracy)
print('\n\n Classification Report:')
print(metrics.classification_report(y_test, GDTree_pred))
# Plot the confusion matrix.
plot_confusion_matrix(GdTree,x_test,y_test)
Part 1: the model correctly predicted 7 healthy people as healthy (TN) and incorrectly flagged 12 healthy people as having Parkinson's (FP).
Part 2: the model incorrectly predicted 2 Parkinson's patients as healthy (FN) and correctly identified 38 Parkinson's patients (TP).
Since our objective is to classify patients into Parkinson's and healthy labels, correctly identifying Parkinson's patients (recall) is the key criterion. This model's success rate is 38/40 = 95%, better than the previous unregularized tree's 93% recall for catching Parkinson's patients.
# Decision tree trained with the entropy (information-gain) criterion.
EdTree = DecisionTreeClassifier(criterion = 'entropy', random_state=1 )
EdTree.fit(x_train, y_train)

# Predictions on the test split.
EDTree_pred = EdTree.predict(x_test)

# Accuracy, classification report and recall for the entropy tree.
entropy_accuracy = accuracy_score(y_test, EDTree_pred) * 100
print('Accuracy of the Decision Tree model with Entropy critera is', entropy_accuracy)
print('\n\n Classification Report:')
print(metrics.classification_report(y_test, EDTree_pred))
print(recall_score(y_test, EDTree_pred))

# Confusion matrix display.
plot_confusion_matrix(EdTree,x_test,y_test)
Part 1: the model correctly predicted 10 healthy people as healthy (TN) and incorrectly flagged 9 healthy people as having Parkinson's (FP).
Part 2: the model incorrectly predicted 3 Parkinson's patients as healthy (FN) and correctly identified 37 Parkinson's patients (TP).
So the success rate of this model for correctly identifying Parkinson's patients is 37/40 ≈ 93%.
# Standardize the features for the distance/scale-sensitive models below.
# IMPORTANT: the test split must be scaled with the *training* statistics.
# The original code (x_test.apply(zscore)) standardized the test set with its
# own mean/std, which leaks test-set information and puts the two splits on
# different scales.
train_mean = x_train.mean()
train_std = x_train.std(ddof=0)  # ddof=0 matches scipy.stats.zscore's default
Xtrainsc = (x_train - train_mean) / train_std
Xtestsc = (x_test - train_mean) / train_std
Xtrainsc.head()
from sklearn.linear_model import LogisticRegression
# Logistic regression on the standardized features.
LR_model = LogisticRegression(random_state=0)
LR_model.fit(Xtrainsc, y_train)
# Prediction on the scaled test data.
LRY_pred = LR_model.predict(Xtestsc)
# Store accuracy and recall for the final model-comparison table.
LR_Accuracy = accuracy_score(y_test, LRY_pred)*100
LR_Recallscore = recall_score(y_test, LRY_pred)*100
# Measure the accuracy of this model's predictions.
print('Accuracy of the Logistic regression model is', LR_Accuracy)
# Performance improves thanks to the scaled feature values.
print('\n\n Classification Report:')
print(metrics.classification_report(y_test, LRY_pred))
# Plot the confusion matrix.
plot_confusion_matrix(LR_model,Xtestsc,y_test)
Part 1: the model correctly predicted 11 healthy people as healthy (TN) and incorrectly flagged 8 healthy people as having Parkinson's (FP).
Part 2: the model incorrectly predicted 1 Parkinson's patient as healthy (FN) and correctly identified 39 Parkinson's patients (TP).
So the success rate of this model is 39/40 ≈ 97%, better than the decision tree's 93% recall for correctly identifying Parkinson's patients.
# Import the KNN classifier.
from sklearn.neighbors import KNeighborsClassifier
# Distance-weighted KNN with the default k = 5.
KNN_model = KNeighborsClassifier(n_neighbors= 5 , weights = 'distance' )
KNN_model.fit(Xtrainsc, y_train)
# Prediction on test data.
KNNY_pred = KNN_model.predict(Xtestsc)
# Store accuracy and recall (overwritten later by the tuned k = 12 model).
KNN_Accuracy = accuracy_score(y_test, KNNY_pred)*100
KNN_Recallscore = recall_score(y_test, KNNY_pred)*100
print('Accuracy of the KNN model is', KNN_Accuracy)
print('\n\n Classification Report:')
print(metrics.classification_report(y_test, KNNY_pred))
# Plot the confusion matrix.
plot_confusion_matrix(KNN_model, Xtestsc , y_test)
Part 1: the model correctly predicted 11 healthy people as healthy (TN) and incorrectly flagged 8 healthy people as having Parkinson's (FP).
Part 2: the model incorrectly predicted 2 Parkinson's patients as healthy (FN) and correctly identified 38 Parkinson's patients (TP).
So the success rate of this model for correctly identifying Parkinson's patients is 38/40 = 95%.
# Sweep k and record test accuracy plus the true-positive count for each k.
# The prints and the narrative below refer to k = 1..13 and compare k = 12
# with k = 13, but the original range(1, 13) stopped at k = 12 — use
# range(1, 14) so k = 13 is actually evaluated.
scores = []
TP = []
for k in range(1, 14):
    KNN_M2 = KNeighborsClassifier(n_neighbors = k, weights = 'distance' )
    KNN_M2.fit(Xtrainsc, y_train)
    Ysc_pred = KNN_M2.predict(Xtestsc)
    # Test-set accuracy for this k.
    scores.append(KNN_M2.score(Xtestsc,y_test))
    # cm[1][1] = true positives (correctly identified Parkinson's patients).
    cm = confusion_matrix(y_test,Ysc_pred)
    TP.append(cm[1][1])
print('Accuracy score for KNN model with k =1 to 13\n')
print(scores)
print('\nTrue Positive predictions for KNN model on original scaled data with k =1 to 13\n')
print(TP)
# Visual representation of the scores for k = 1 to 13.
plt.plot(range(1, 14), scores)
From the sweep above we can see that as k grows the overall accuracy decreases but the true-positive count increases. The number of true positives is highest for k = 12 and 13, so we choose k = 12, which gives an accuracy score of about 79.6% while correctly identifying all 40 Parkinson's patients — effectively a 100% true-positive (recall) rate.
# Final KNN model with the chosen k = 12 (distance weighting).
KNN_model = KNeighborsClassifier(n_neighbors=12, weights='distance')
KNN_model.fit(Xtrainsc, y_train)

# Predict on the scaled test split.
KNNY_pred = KNN_model.predict(Xtestsc)

# Overwrite the stored KNN accuracy/recall with the tuned model's numbers.
KNN_Accuracy = 100 * accuracy_score(y_test, KNNY_pred)
KNN_Recallscore = 100 * recall_score(y_test, KNNY_pred)
print('Accuracy of the KNN model is', KNN_Accuracy)
print('\n\n Classification Report:')
print(metrics.classification_report(y_test, KNNY_pred))

# Confusion matrix for the tuned KNN.
plot_confusion_matrix(KNN_model, Xtestsc, y_test)
Part 1: the model correctly predicted 6 healthy people as healthy (TN) and incorrectly flagged 13 healthy people as having Parkinson's (FP).
Part 2: the model missed 0 Parkinson's patients (FN) and correctly identified all 40 Parkinson's patients (TP).
So although the false positives have increased, the model identifies true-positive cases with a success rate of 100%.
# Gaussian Naive Bayes on the standardized features.
from sklearn.naive_bayes import GaussianNB

NB_model = GaussianNB()
NB_model.fit(Xtrainsc, y_train)

# Predict on the scaled test split.
NBY_pred = NB_model.predict(Xtestsc)

# Stash accuracy/recall for the model-comparison table.
NB_Accuracy = 100 * accuracy_score(y_test, NBY_pred)
NB_Recallscore = 100 * recall_score(y_test, NBY_pred)
print('Accuracy of the Naive Bayes model is', NB_Accuracy )
print("\n\nClassification Report")
print(metrics.classification_report(y_test, NBY_pred))

# Confusion matrix.
plot_confusion_matrix(NB_model, Xtestsc, y_test)
Part 1: the model correctly predicted 13 healthy people as healthy (TN) and incorrectly flagged 6 healthy people as having Parkinson's (FP).
Part 2: the model incorrectly predicted 14 Parkinson's patients as healthy (FN) and correctly identified 26 Parkinson's patients (TP).
So the success rate of this model for correctly identifying Parkinson's patients is 26/40 = 65%.
# Import the random forest ensemble.
from sklearn.ensemble import RandomForestClassifier
# Fix random_state so the accuracy/recall and the confusion-matrix counts
# quoted in the narrative below are reproducible — without a seed every run
# of the notebook produces different numbers.
RF_model = RandomForestClassifier(n_estimators = 50, random_state = 1)
RF_model.fit(Xtrainsc, y_train)
# Prediction on test data.
RFY_pred = RF_model.predict(Xtestsc)
# Store accuracy and recall for the model-comparison table.
RF_Accuracy = accuracy_score(y_test, RFY_pred)*100
RF_Recallscore = recall_score(y_test, RFY_pred)*100
print('Accuracy of the Random Forest model is', RF_Accuracy , RF_Recallscore )
print("\n\nClassification Report")
print(metrics.classification_report(y_test, RFY_pred))
# Plot the confusion matrix.
plot_confusion_matrix(RF_model, Xtestsc , y_test)
Part 1: the model correctly predicted 6 healthy people as healthy (TN) and incorrectly flagged 13 healthy people as having Parkinson's (FP).
Part 2: the model incorrectly predicted 1 Parkinson's patient as healthy (FN) and correctly identified 39 Parkinson's patients (TP).
So the success rate of this model for correctly identifying Parkinson's patients is 39/40 ≈ 97%. (Note: without a fixed random seed these counts vary between runs.)
# Import the stacking meta-classifier from mlxtend.
from mlxtend.classifier import StackingCVClassifier
# Stack the five models built above; a fresh logistic regression combines
# their predictions. NOTE(review): StackingCVClassifier is expected to re-fit
# the base classifiers on the (scaled) data passed to fit() — confirm, since
# GdTree was originally trained on unscaled features.
MC_model = StackingCVClassifier(classifiers=[LR_model, KNN_model, NB_model, RF_model, GdTree ], meta_classifier = LogisticRegression(random_state=0))
MC_model.fit(Xtrainsc,y_train)
# Prediction on test data.
MCY_pred = MC_model.predict(Xtestsc)
# Store accuracy and recall for the model-comparison table.
MC_Accuracy = accuracy_score(y_test, MCY_pred)*100
MC_Recallscore = recall_score(y_test, MCY_pred)*100
print('Accuracy of the Meta model is', MC_Accuracy )
print("\n\nClassification Report")
print(metrics.classification_report(y_test, MCY_pred))
# Final comparison: accuracy and recall for every model built above.
data = { 'Model' : ['Logistic Regression' , 'KNN' , 'Naive Bayes' , 'Random Forest' , 'Decision Tree', 'Meta Classifier'] ,
'Accuracy' : [LR_Accuracy, KNN_Accuracy,NB_Accuracy,RF_Accuracy,DT_Accuracy,MC_Accuracy] ,
'Recall Score' : [LR_Recallscore, KNN_Recallscore,NB_Recallscore,RF_Recallscore,DT_Recallscore,MC_Recallscore]
}
df = pd.DataFrame(data)
df
# Enable grid lines *before* creating the axes — the original set rcParams
# after plotting, which has no effect on axes that already exist.
plt.rcParams['axes.grid'] = True
fig, ax = plt.subplots(1,2,figsize=(15,5))
sns.barplot(x='Model' , y = 'Recall Score' , data = df,ax = ax[0] )
sns.barplot(x='Model' , y = 'Accuracy' , data = df ,ax = ax[1])
fig.tight_layout()
plt.show()  # the original `plt.show` (no parentheses) never called the function